library(ggplot2)
Want to understand how all the pieces fit together? Read R for Data Science: https://r4ds.had.co.nz/
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
library(tidyr)
library(reshape2)
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
library(RColorBrewer)
library(ggrepel)
library(ggthemes)
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(plotly)
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
# Read pokemon.csv from the working directory into df, show its first rows,
# and print how many rows (instances) were loaded.
df <- read.csv(file = "pokemon.csv")
head(x = df)
cat("Number of instances : ", nrow(df))
Number of instances : 800
# Print the number of columns (attributes) of df; the leading "\n" starts a
# new line after the instance count printed just before.
cat("\nNumber of attributes : ",ncol(df))
Number of attributes : 12
# Show the structure (column names, types, first values) of df1.
# NOTE(review): in this listing df1 is only assigned much further down (the
# per-type median HP table); no earlier definition is visible, so this call
# presumably relies on df1 already existing in the session - confirm.
str(df1)
'data.frame': 18 obs. of 2 variables:
$ unique.df.Type.1.: chr "Grass" "Fire" "Water" "Bug" ...
$ HP : num 60 65 80 60 78 70 70 79 59.5 65.5 ...
# Per-column summary statistics of df1.
# NOTE(review): df1 is assigned only later in this listing; verify that it
# exists before this statement is run.
summary(df1)
unique.df.Type.1. HP
Length:18 Min. :59.50
Class :character 1st Qu.:65.12
Mode :character Median :69.25
Mean :68.67
3rd Qu.:70.00
Max. :80.00
# Print the number of NA entries in each column of df, one column per line,
# in the form "<column name> --> [1] <count>".
# seq_len() is used instead of 1:ncol(df) so that a data frame with zero
# columns produces no iterations rather than looping over c(1, 0).
for (col in seq_len(ncol(df))) {
  cat(colnames(df)[col])
  cat(" --> ")
  print(sum(is.na(df[[col]])))
}
Name --> [1] 0
Type.1 --> [1] 0
Type.2 --> [1] 0
Total --> [1] 0
HP --> [1] 0
Attack --> [1] 0
Defense --> [1] 0
Sp..Atk --> [1] 0
Sp..Def --> [1] 0
Speed --> [1] 0
Generation --> [1] 0
Legendary --> [1] 0
From the output above it seems that there are no missing values. However, the data set contains empty strings, which are really missing values that is.na() does not detect.
Converting the empty values into missing values (NA)
# Treat empty strings as missing: every cell of df equal to "" becomes NA.
df[df == ""] <- NA
# Count the NA entries per column again, printing
# "<column name> --> [1] <count>" for each column.
# seq_len() is used instead of 1:ncol(df) so that a data frame with zero
# columns produces no iterations rather than looping over c(1, 0).
for (col in seq_len(ncol(df))) {
  cat(colnames(df)[col])
  cat(" --> ")
  print(sum(is.na(df[[col]])))
}
Name --> [1] 0
Type.1 --> [1] 0
Type.2 --> [1] 386
Total --> [1] 0
HP --> [1] 0
Attack --> [1] 0
Defense --> [1] 0
Sp..Atk --> [1] 0
Sp..Def --> [1] 0
Speed --> [1] 0
Generation --> [1] 0
Legendary --> [1] 0
print("Percentage of Missing values in Type.2 attribute")
[1] "Percentage of Missing values in Type.2 attribute"
# Share of rows whose Type.2 is NA, expressed as a percentage.
# The earlier expression divided (rows - NA count) by rows, which is the share
# of rows that are NOT missing.
print(100 * mean(is.na(df[["Type.2"]])))
[1] 48.25
About 48% of the values in the ‘Type-2’ attribute are missing (386 of 800 rows)
# Horizontal stacked bar chart: number of rows per Type.1, with each bar
# split by the Legendary flag.
ggplot(data = df, mapping = aes(x = Type.1, fill = Legendary)) +
  geom_bar(position = "stack", colour = "black") +
  coord_flip() +
  labs(
    title = "Contribution of Type-1 and Legendary",
    x = "Type-1",
    y = "Count"
  )
Number of Pokemon by Type-1
# Count the rows for each Type.1 and draw the counts as horizontal bars,
# ordered by count, with the count written next to each bar.
df %>%
  count(Type.1, name = "number") %>%
  ggplot(mapping = aes(x = reorder(Type.1, number), y = number, fill = Type.1)) +
  geom_bar(stat = "identity", colour = "black") +
  geom_text(mapping = aes(label = number), hjust = -1.0) +
  coord_flip() +
  labs(
    title = "Number of Pokemon by Type-1",
    x = "Type-1 of Pokemon",
    y = "Number of Pokemon"
  )
Number of Pokemon by Type-2
# Count the rows for each Type.2 and draw the counts as horizontal bars,
# ordered by count, with the count written next to each bar.
# Rows without a secondary type are excluded explicitly: both NA and "" are
# dropped, so the result does not depend on filter() silently discarding
# rows whose comparison evaluates to NA.
df %>%
  filter(!is.na(Type.2), Type.2 != "") %>%
  group_by(Type.2) %>%
  summarise(number = n()) %>%
  ggplot(aes(x = reorder(Type.2, number), y = number, fill = Type.2)) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    x = "Type-2 of Pokemon",
    y = "Number of Pokemon",
    title = "Number of Pokemon by Type-2"
  ) +
  coord_flip() +
  geom_text(aes(label = number), hjust = -1.0)
Pokemons with higher attack ratings are faster.
# Jittered scatter plot of Defense against Attack; each point is coloured by
# Speed on a blue-to-dark-orange gradient.
ggplot(data = df, mapping = aes(x = Attack, y = Defense)) +
  geom_jitter(mapping = aes(colour = Speed)) +
  scale_colour_gradient(low = "blue", high = "darkorange") +
  ggtitle("Defense vs Attack wrt Speed")
# Pairwise plot matrix of the six stat columns, drawn with the black-and-white
# theme and a common title.
ggpairs(
  data = df,
  columns = c("Attack", "Defense", "HP", "Sp..Atk", "Sp..Def", "Speed")
) +
  theme_bw() +
  labs(title = "Correlation Matrix of Pokemon Stats")
plot: [1,1] [==>-------------------------------------------------------------------------------------------] 3% est: 0s
plot: [1,2] [====>-----------------------------------------------------------------------------------------] 6% est: 0s
plot: [1,3] [=======>--------------------------------------------------------------------------------------] 8% est: 1s
plot: [1,4] [=========>------------------------------------------------------------------------------------] 11% est: 1s
plot: [1,5] [============>---------------------------------------------------------------------------------] 14% est: 1s
plot: [1,6] [===============>------------------------------------------------------------------------------] 17% est: 1s
plot: [2,1] [=================>----------------------------------------------------------------------------] 19% est: 1s
plot: [2,2] [====================>-------------------------------------------------------------------------] 22% est: 1s
plot: [2,3] [=======================>----------------------------------------------------------------------] 25% est: 1s
plot: [2,4] [=========================>--------------------------------------------------------------------] 28% est: 1s
plot: [2,5] [============================>-----------------------------------------------------------------] 31% est: 0s
plot: [2,6] [==============================>---------------------------------------------------------------] 33% est: 0s
plot: [3,1] [=================================>------------------------------------------------------------] 36% est: 0s
plot: [3,2] [====================================>---------------------------------------------------------] 39% est: 0s
plot: [3,3] [======================================>-------------------------------------------------------] 42% est: 0s
plot: [3,4] [=========================================>----------------------------------------------------] 44% est: 0s
plot: [3,5] [===========================================>--------------------------------------------------] 47% est: 0s
plot: [3,6] [==============================================>-----------------------------------------------] 50% est: 0s
plot: [4,1] [=================================================>--------------------------------------------] 53% est: 0s
plot: [4,2] [===================================================>------------------------------------------] 56% est: 0s
plot: [4,3] [======================================================>---------------------------------------] 58% est: 0s
plot: [4,4] [========================================================>-------------------------------------] 61% est: 0s
plot: [4,5] [===========================================================>----------------------------------] 64% est: 0s
plot: [4,6] [==============================================================>-------------------------------] 67% est: 0s
plot: [5,1] [================================================================>-----------------------------] 69% est: 0s
plot: [5,2] [===================================================================>--------------------------] 72% est: 0s
plot: [5,3] [=====================================================================>------------------------] 75% est: 0s
plot: [5,4] [========================================================================>---------------------] 78% est: 0s
plot: [5,5] [===========================================================================>------------------] 81% est: 0s
plot: [5,6] [=============================================================================>----------------] 83% est: 0s
plot: [6,1] [================================================================================>-------------] 86% est: 0s
plot: [6,2] [===================================================================================>----------] 89% est: 0s
plot: [6,3] [=====================================================================================>--------] 92% est: 0s
plot: [6,4] [========================================================================================>-----] 94% est: 0s
plot: [6,5] [==========================================================================================>---] 97% est: 0s
plot: [6,6] [==============================================================================================]100% est: 0s
# Density plots of four stat columns (HP, Speed, Attack, Defense), each with
# its own fill colour, arranged in a 2-column grid.
# The plot titles previously misspelled "Characteristics".
density_hp <- ggplot(data = df, mapping = aes(x = HP)) +
  geom_density(colour = "white", fill = "pink", alpha = 0.8) +
  ggtitle("Density Plot of HP")

density_speed <- ggplot(data = df, mapping = aes(x = Speed)) +
  geom_density(colour = "white", fill = "darkorchid", alpha = 0.8) +
  ggtitle("Density Plot of Speed Characteristics")

density_attack <- ggplot(data = df, mapping = aes(x = Attack)) +
  geom_density(colour = "white", fill = "orange", alpha = 0.7) +
  ggtitle("Density Plot of Attack Characteristics")

density_defense <- ggplot(data = df, mapping = aes(x = Defense)) +
  geom_density(colour = "white", fill = "firebrick", alpha = 0.7) +
  ggtitle("Density Plot of Defense Characteristics")

grid.arrange(
  density_hp, density_speed, density_attack, density_defense,
  ncol = 2
)
Score of Pokemon by generation
## Reshape the columns HP through Speed into long form:
## `key` holds the column (stat) name and `value` holds that stat's score.
## One box plot per Generation is then drawn for every stat, one facet each.
## pivot_longer() replaces the superseded gather().
df %>%
  pivot_longer(cols = HP:Speed, names_to = "key", values_to = "value") %>%
  ggplot(aes(x = Generation, y = value, fill = as.factor(Generation))) +
  geom_boxplot() +
  facet_grid(~key) +
  labs(
    x = "Generation",
    y = "Score",
    title = "Various score based on Generation flag"
  )
Score of Pokemon by Legendary type
## Reshape the columns HP through Speed into long form:
## `key` holds the column (stat) name and `value` holds that stat's score.
## One box plot per Legendary value is then drawn for every stat, one facet each.
## pivot_longer() replaces the superseded gather(); the axis label and title
## previously misspelled "Legendary".
df %>%
  pivot_longer(cols = HP:Speed, names_to = "key", values_to = "value") %>%
  ggplot(aes(x = Legendary, y = value, fill = as.factor(Legendary))) +
  geom_boxplot() +
  facet_grid(~key) +
  labs(
    x = "Legendary",
    y = "Score",
    title = "Various score based on Legendary flag"
  )
# Median HP for each primary type (Type.1), one row per type.
# Only the HP column is aggregated, so median() is never applied to the
# non-numeric columns (which produced a warning per group and column).
# The type label is taken from aggregate() itself instead of from
# unique(df$Type.1): aggregate() returns its groups in sorted order while
# unique() returns them in order of first appearance, so combining the two
# attached the medians to the wrong type names.
# The label column keeps the name unique.df.Type.1. used by the pie chart.
df1 <- aggregate(
  df["HP"],
  by = list(unique.df.Type.1. = df$Type.1),
  FUN = median
)
# Pie chart built from df1: one slice per type label, sized by the HP column.
plot_ly(
  data = df1,
  type = "pie",
  labels = ~unique.df.Type.1.,
  values = ~HP
)